Import & UDFs¶

Importing libraries¶

In [ ]:
import pandas as pd 
pd.set_option("display.max_columns", None)
import numpy as np

import time
import os

import caffeine    

from tqdm import tqdm

import seaborn as sns 
sns.set_theme(style="darkgrid", palette="Set1")

import klib

import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.colors import n_colors
import plotly.offline as pyo

import warnings
warnings.filterwarnings('ignore')

import folium

import re

import chart_studio
import chart_studio.plotly as py

UDFs¶

In [ ]:
# This is to diplay nicely pandas dataframes
def table (df, row_size=40, line_break=False):
    if line_break == True:
        for col in df.columns:
            df[col] = df[col].apply(lambda x: x.replace(" ", "<br>") if type(x) == str else x)
    
    table_data = [df.columns] + df.values.tolist()[:10]

    
    return ff.create_table(table_data)

# Return a capitalized version of the dataframe
def capitalize(df):
    for col in df.columns:
        try:df[col] = df[col].str.title()
        except: ...
            
    return df

Import cleaned data¶

In [ ]:
df_games = pd.read_csv("data/clean_games.csv")
df_events = pd.read_csv("data/clean_events.csv")
df_players = pd.read_csv("data/clean_players.csv")
df_players_info = pd.read_csv("data/clean_players_info.csv")
df_lineup = pd.read_csv("data/clean_lineup.csv")
# df_stats = pd.read_csv("data/clean_stats.csv")

Feature engineering & Insights¶

Players stats¶

Playing time¶

Calculating the time spent

There is a rule that if a player leaves the pitch (subtitution or card) he can not return so that gives us some defined senarios:

IMG-3097

In [ ]:
# Creating substition of df_events, we don't need all the columns
sub_events = df_events[df_events["description"]\
                       .isin(["sub_on", "sub_off", "red_card", "2y_red_card"])]\
                       [["type", "description", "player_id", "match_id", "clock_label"]]

# Creating substition of df_players, we don't need all the columns
sub_players = df_players[["player_name", "player_id", "substitute", "match_id"]]


sub_merged = sub_players.merge(sub_events, on=["player_id", "match_id"], how="left").drop_duplicates()


# If player not substitute (starting game on the pitch, not on bench)
sub_merged["time_pitch"] = sub_merged[["clock_label", "description", "substitute"]]\
    .apply(lambda x: (90 if pd.isnull(x["description"]) else x["clock_label"]) if x["substitute"] == 0 else 0,
           axis=1)


sub_merged["time_pitch"] = sub_merged[["clock_label", "description", "substitute", "time_pitch"]]\
    .apply(lambda x: (90-x["clock_label"] if x["description"]=="sub_on" else x["clock_label"]-90)\
                     if pd.notnull(x["description"]) and x["substitute"] == 1 else x["time_pitch"],
           axis=1)

Merging time played on df_players

In [ ]:
# Addding time for each player/game as some players subed on and red_card or sub off
sub_merged = sub_merged.groupby(["player_name", "match_id", "player_id"])\
                       .sum()["time_pitch"].sort_values(ascending=False).to_frame().reset_index()


df_players = df_players.merge(sub_merged[["player_id", "match_id", "time_pitch"]],
                              on=["player_id", "match_id"], 
                              how="left")


df_players.head()
Out[ ]:
match_position team_id info_loan age substitute substit ute captain player_number full_position match_id player_name player_id nateam birth_date birth_country_isoCode birth_country normal_position time_pitch
0 d 28 NaN 29.557988 0.0 NaN False NaN defender 3405 stefan schnoor 1545 germany 1971-04-18 de germany d 90.0
1 d 28 NaN 29.436949 0.0 NaN False NaN defender 3348 stefan schnoor 1545 germany 1971-04-18 de germany d 90.0
2 d 28 NaN 28.861989 NaN 1.0 False NaN defender 3162 stefan schnoor 1545 germany 1971-04-18 de germany d 0.0
3 d 28 NaN 28.919485 NaN 1.0 False NaN defender 3189 stefan schnoor 1545 germany 1971-04-18 de germany d 0.0
4 d 28 NaN 28.465563 0.0 NaN False NaN defender 3002 stefan schnoor 1545 germany 1971-04-18 de germany d 90.0

Player number¶

Is there a number to position pattern in football? Do players wear a certain number if they play a certain position?

In [ ]:
# Replacing letter by real position
replacement = {"g": "goalkeeper",
              "d": "defender", 
              "m": "midfielder",
              "f": "forward"}

df_players["normal_position"].replace(replacement, inplace=True)
df_players_info["normal_position"].replace(replacement, inplace=True)


# Creating dictionary with the noral position mode fo each number
player_num_position = {int(i):df_players[df_players.player_number==i]["normal_position"].mode()[0] for i in df_players["player_number"].unique() if pd.isnull(i)==False}


# Replacing numbers 99 as missing values
df_players["player_number"] = df_players["player_number"]\
                              .apply(lambda x: np.nan if x == 99 or x == -1 or pd.isnull(x) else int(x))

# # Number who used in the games more than 1,000 time.
sub_df = df_players["player_number"].value_counts().sort_values(ascending=False)
sub_df = sub_df[sub_df > 1000]
sub_df.index = list(map(lambda x: int(x), sub_df.index))
sub_df = sub_df.to_frame().reset_index()
sub_df["position"] = sub_df["index"].apply(lambda x: player_num_position[x])


players_number = px.bar(sub_df, x="index", y="player_number", color="position", 
                        title="Number of time the most used numbers have been used (>1,000)")

players_number.update_xaxes(title_text="Player's number")
players_number.update_yaxes(title_text="Number of games it was used in")


pyo.init_notebook_mode()
pyo.iplot(players_number, filename = 'Number of time the most used numbers have been used (>1,000)')
# players_number.show()

Players goals¶

In [ ]:
# Creating a substitute df to merge 
df_events_players = df_events.merge(df_players[["player_name", "player_id", "match_id"]], 
                                    how="left")

# Creating df with all the scorers
df_scorers = df_events_players[df_events_players["type"]=="goal"].groupby(["player_name", "player_id"])\
                                     .count()\
                                     .sort_values(by="match_id", ascending=False)\
                                     .reset_index()\
                                     [["player_name", "match_id", "player_id"]]\
                                     .rename(columns={"match_id":"goals"})
                
# Creating df with all the assisters
df_assisters = df_events_players[df_events_players.assist_id.notna()].drop_duplicates(subset=["clock_label", "assist_id"])\
                                                         .groupby(["player_name", "player_id"])\
                                                         .count()\
                                                         .reset_index()\
                                                         [["match_id", "player_id"]]\
                                                         .rename(columns={"match_id":"assists"})    

# Adding the number of assists to the scorers
df_scorers = df_scorers.merge(df_assisters, 
                                how="left",
                                on="player_id").fillna(0)

# Adding players infos to df_scorers 
df_scorers = df_scorers.merge(df_players_info[["birth_country", "player_id", "normal_position"]],
                             how="left")
    
    

df_scorers.index = list(range(1, len(df_scorers)+1))
df_scorers["assists"] = df_scorers["assists"].astype(int)


# Creating a new df of the scorers with at least 100 goals
df_scorers_100_goals = df_scorers[df_scorers["goals"] > 100]

more_goals_100 = px.bar(capitalize(df_scorers_100_goals), 
             x='player_name', y='goals',
             hover_data=["goals", "assists", "birth_country", "normal_position"], 
             color='goals',
             height=500,
             color_continuous_scale=px.colors.sequential.Blues[4:], 
             title = "Players who scored more than 100 goals", 
             template="plotly_white", range_y=[70, 260])


more_goals_100.update_xaxes(title_text="Player's number", tickangle=-90)
more_goals_100.update_yaxes(title_text="Number of games it was used in")
more_goals_100.update(layout_coloraxis_showscale=False)

# py.plot(more_goals_100)

more_goals_100.show()

Playing time vs goals scored¶

Most effective player (aka the one that had the less number of minutes per goal)

In [ ]:
# Grouping time to have total time per player
df_times = df_players.groupby(["player_id", "normal_position"])\
                     .sum()\
                     .sort_values(by="time_pitch", ascending=False)\
                     .reset_index()\
                     [["player_id", "time_pitch"]]


# Merging scorers and their time together
df_goals_vs_time = df_scorers.merge(df_times, how="left", on="player_id").dropna()


# Getting minutes per goal
df_goals_vs_time["goal_every"] = round(df_goals_vs_time["time_pitch"] / df_goals_vs_time["goals"], 2)


df_goals_vs_time.head(5)
Out[ ]:
player_name goals player_id assists birth_country normal_position time_pitch goal_every
0 alan shearer 260 89.0 136 england forward 37881.0 145.70
1 wayne rooney 208 2064.0 118 england forward 36237.0 174.22
2 andrew cole 187 725.0 112 england forward 30139.0 161.17
3 sergio agüero 184 4328.0 110 argentina forward 18881.0 102.61
4 frank lampard 177 800.0 77 england midfielder 47744.0 269.74
In [ ]:
# subsetting to keep players who played at least 1000 minutes.
df_goals_vs_time = df_goals_vs_time[df_goals_vs_time["time_pitch"]>1000]

# Getting the top 20 players
df = df_goals_vs_time.sort_values("goal_every").reset_index().loc[:20].round(2)


fig = px.bar(df.sort_values('goal_every', ascending=False), 
             x='goal_every', y='player_name',
             hover_data=["goals", "time_pitch", "normal_position", "goal_every"], 
             color='goal_every',
             range_x=[100, 160],
             height=500,
            color_continuous_scale=px.colors.sequential.Blues[3:][::-1],
            template="plotly_white", 
            title="Most effective players (lower is better)", barmode='stack')


fig.update_xaxes(title_text='Goals every (in minutes)')
fig.update_yaxes(title_text='Player name')
fig.update(layout_coloraxis_showscale=False)

# py.plot(fig)

fig.show()

Most effective player

In [ ]:
goal_time = px.scatter(capitalize(df_goals_vs_time), y="goals", x="time_pitch", 
           color="normal_position", hover_name="player_name", 
               title="Minutes played vs Goals scores vs Player position", range_y=[-20, 270])


goal_time.update_xaxes(title_text='Time on pitch (in minutes)')
goal_time.update_yaxes(title_text='Number of goals inn career')
goal_time.update_layout(legend_title="Player's position")


goal_time.add_shape(
    dict(type="circle", x0=-100, y0=-50, x1=20000, y1=100), row=1, col="all", line_color="orange")



# py.plot(goal_time)

goal_time.show()

# df_goals_vs_time.isna().sum()

This graph is what we could expect from it. Most players concentrated in what seems to be a triangle (0, 0), (12.5k, 50) and (20k, 0). There is a clear color orders for the number of goals scored: purple for goalkeepers, green for defenders, red for midfielders and then blue for forwards. It's logical and makes sense that the closer your position is from the opponent's goal the more likely you are to score a goal. This change if a player played a lot. For example, David Beckham (red point at (21k, 62)) scored more goals than most forward with less than 20,000 minutes played as he played way more than them (and was very good too).

The three most extreme points are:

  • Wayne Rooney: 38,000 minutes played for 208 goals
  • Frank Lampard: 50,000 minutes played for 177 goals
  • Gareth Bary: 55,000 minutes played for 53 goals

You may not realize but 40,000 minutes played is the equivalent of playing every minute of every Premier League game for more than 11 seasons. Mind blowing.

TALK ABOUT HENRY, AGUERO, etc...

In [ ]:
fig = px.scatter_3d(df_goals_vs_time, x='goals', y='time_pitch', z='assists',
                    color='normal_position', 
                    labels={"time_pitch":"Minutes played", 
                            "assists":"Number of assists",
                            "goals":"Number of goals"}, 
                    hover_data=df_goals_vs_time.columns,
                   title="Minutes played vs goals scores vs assists vs player's position")


fig.update_layout(legend_title="Player's position")
fig.show()

Goals vs assists over the years

In [ ]:
sub_players = df_players[["match_id", "player_id", "time_pitch"]]
sub_events = df_events[df_events.type=="goal"][["player_id", "assist_id", "match_id"]]


sub_df = sub_events.set_index("match_id")\
            .unstack().reset_index()\
            .rename(columns={"level_0":"type", 0:"player_id"})\
            .dropna(subset=["player_id"])


sub_df["goals"] = sub_df["type"].apply(lambda x: 1 if x == "player_id" else 0)
sub_df["assists"] = sub_df["type"].apply(lambda x: 1 if x == "assist_id" else 0)
sub_df.drop("type", axis=1, inplace=True)



sub_df = sub_df.groupby(["match_id", "player_id"])\
               .sum()\
               .reset_index()


sub_df = sub_df.merge(df_players_info[["player_id", "player_name", "normal_position"]], 
                        how="left", 
                        on="player_id")


match_id_season = dict(zip(df_games.match_id, df_games.season))
sub_df["season"] = sub_df["match_id"].apply(lambda x: match_id_season[x])




sub_df = sub_df.merge(df_players[["match_id", "player_id", "time_pitch"]], 
                     how = "left", 
                     on=["match_id", "player_id"])









sub_df = sub_df.groupby(["player_name", "normal_position", "season"]).sum().reset_index().sort_values("season")



color=["#636efb", "#ef553b", "#00cc96", "#ab63fa"]
diff = ["forward", "midfielder", "defender", "goalkeeper"]
colors = {}
for index, x in enumerate(color): 
    colors[diff[index]] = f"rgb{str(px.colors.hex_to_rgb(x))}"


assist_goals = px.scatter(sub_df, 
                          y="goals", x="assists", color="normal_position", 
                          animation_frame="season", hover_name="player_name", size="time_pitch", 
                          color_discrete_map=colors, range_y=[0, 40], 
                         title = "Goals vs assists per season")

assist_goals.update_xaxes(title_text="Assists")
assist_goals.update_yaxes(title_text="Goals")
assist_goals.update_layout(legend_title="Player's position")


# py.plot(assist_goals)
assist_goals

Best goalkeeper

In [ ]:
home = df_games[["home_team_id", "away_score", "match_id", "season", "home_team"]].values.tolist()
away = df_games[["away_team_id", "home_score", "match_id", "season", "away_team"]].values.tolist()

df_team_score = home + away


df_team_score = pd.DataFrame(df_team_score, columns = ["team_id", "score", "match_id", "season", "team"])


clean_sheet = df_team_score[df_team_score.score==0]

clean_sheet[(clean_sheet.season=="2004/05") & (clean_sheet.team_id==4)]

goalkeepers = df_players[(df_players.normal_position=="goalkeeper") &\
                         (df_players.substitute==0)]\
                        [["team_id", "match_id", "player_name"]]
clean_sheet = clean_sheet.merge(goalkeepers, how="left", on=["team_id", "match_id"])

clean_sheet = clean_sheet.groupby(["season", "player_name", "team"]).count().reset_index()\
                         .sort_values(["season", "team_id"], ascending=[1, 0])


clean_sheet = clean_sheet[clean_sheet.team_id>9]



fig = px.bar(capitalize(clean_sheet), y="team_id", x="player_name", color = "team_id", 
       color_continuous_scale=px.colors.sequential.BuPu[4:],
      animation_frame="season", hover_name="player_name", range_y=[0, 25], 
            title="Cleansheets per season (at leat 10)", hover_data=["team"])


fig.update_xaxes(title_text=None, tickangle=40)
fig.update_yaxes(title_text="Cleansheet count")
fig.update(layout_coloraxis_showscale=False)

# py.plot(fig)
fig

Avg minutes for goal per position

In [ ]:
sub_df = df_goals_vs_time.groupby("normal_position").mean()["goal_every"]\
                         .round(0)\
                         .sort_values()

for x in sub_df.index:
    print(f"On average, a {x} scores a goal every {int(sub_df[x]):,} minutes.")
On average, a Forward scores a goal every 340 minutes.
On average, a F scores a goal every 1,228 minutes.
On average, a Midfielder scores a goal every 1,240 minutes.
On average, a D scores a goal every 1,455 minutes.
On average, a Defender scores a goal every 3,049 minutes.
On average, a Goalkeeper scores a goal every 28,048 minutes.

Duos: assister & scorer¶

Who's the best duo ever? Which combinaison of two players passed anbd s

In [ ]:
players = {}
sus = df_players.drop_duplicates(subset=["player_name", "player_id"]).reset_index(drop=True)
for x in tqdm(range(len(sus))):
    a = sus.loc[x , ["player_name", "player_id"]].to_list()
    players[a[1]] = a[0]

df = df_events[["player_id", "assist_id"]].dropna()


df = df.replace(players)

df["combi"] = df["player_id"].astype(str) + " - " + df["assist_id"].astype(str)


df_combi = pd.DataFrame()

for x in tqdm(df["combi"]):
    df_combi.loc[x, "count"] = df["combi"].to_list().count(x)
    
df_combi = df_combi.reset_index().sort_values("count", ascending=False).head(20)
100%|██████████| 4863/4863 [00:00<00:00, 6364.89it/s]
100%|██████████| 20337/20337 [00:08<00:00, 2291.37it/s]
In [ ]:
df_combi["count"] = df_combi["count"].astype(int)
df_combi["index"] = df_combi["index"].str.title()
table(df_combi.rename(columns={"index":"Players: scorer - assist"}))

Players origin¶

In [ ]:
best_scorers_country = df_scorers.merge(df_players_info[["player_id", "birth_country"]])\
                                 .sort_values("goals", ascending=False)\
                                 .drop_duplicates(subset=["birth_country"], keep="first")\
                                 [["player_name", "goals", "birth_country"]]

best_assisters_country = df_scorers.merge(df_players_info[["player_id", "birth_country"]])\
                                 .sort_values("goals", ascending=False)\
                                 .drop_duplicates(subset=["birth_country"], keep="first")\
                                 [["player_name", "assists", "birth_country"]]   
    
best_assisters_country
Out[ ]:
player_name assists birth_country
0 alan shearer 136 england
3 sergio agüero 110 argentina
5 thierry henry 99 france
12 robin van persie 78 netherlands
13 jimmy floyd hasselbaink 73 suriname
... ... ... ...
2198 ragnar klavan 1 estonia
2232 mbwana samatta 0 tanzania
1973 emmanuel rivière 1 martinique
1961 domingos quina 1 guinea-bissau
2150 paul tisdale 1 malta

109 rows × 3 columns

In [ ]:
df_scorers
Out[ ]:
player_name goals player_id assists birth_country normal_position
1 alan shearer 260 89.0 136 england forward
2 wayne rooney 208 2064.0 118 england forward
3 andrew cole 187 725.0 112 england forward
4 sergio agüero 184 4328.0 110 argentina forward
5 frank lampard 177 800.0 77 england midfielder
... ... ... ... ... ... ...
2452 jelle van damme 1 2795.0 1 belgium defender
2453 jefferson montero 1 10518.0 1 ecuador midfielder
2454 roger espinoza 1 4699.0 0 honduras midfielder
2455 jeff whitley 1 1059.0 1 northern ireland midfielder
2456 özalan alpay 1 1839.0 0 turkey defender

2456 rows × 6 columns

In [ ]:
# creating gradient color
gradient = "E63946 E63946 F26430 CDEAE5 A8DADC 90C3CD 77ABBD 92AFD7 1D3557"
gradient = ["#"+x.lower() for x in gradient.split(" ")]

# gradient = (gradient*10)[:15]
gradient

gradient


# Renaming countries to make them fit with choropleth's country list
df = df_players_info.copy()

not_in = ['faroe islands', 'cape verde', 'curacao', np.nan, 'montserrat', 'guadeloupe', 'bermuda', "reunion",
          'barbados', 'martinique', 'malta', 'grenada', 'antigua & barbuda', "st. kitts & nevis"]

country_replace = {'united states': 'united states of america',
                  'congo - kinshasa': "congo", 
                  'wales':"United Kingdom", 
                  'scotland' : "United Kingdom",
                  'northern ireland' : "United Kingdom", 
                  'cote d’ivoire' :"Côte d'Ivoire", 
                  'central african republic': "Central African Rep.", 
                  'england': "United Kingdom", 
                  'bosnia & herzegovina': "Bosnia and Herz.",
                  "trinidad & tobago": "Trinidad and Tobago",
                  'north macedonia': "Macedonia", 
                  'yugoslavia': "Serbia"}


df["birth_country"] = df["birth_country"].replace(country_replace).to_list()



best_scorers_country = df_scorers.merge(df_players_info[["player_id", "birth_country"]])\
                                 .sort_values("goals", ascending=False)\
                                 .drop_duplicates(subset=["birth_country"], keep="first")\
                                 [["player_name", "goals", "birth_country"]]\
                                 .rename(columns={"player_name":"Best scorer"})

best_assisters_country = df_scorers.merge(df_players_info[["player_id", "birth_country"]])\
                                 .sort_values("goals", ascending=False)\
                                 .drop_duplicates(subset=["birth_country"], keep="first")\
                                 [["player_name", "assists", "birth_country"]]\
                                 .rename(columns={"player_name":"Best assister"})

best_scorers_country["birth_country"] = best_scorers_country["birth_country"].replace(country_replace).to_list()
best_assisters_country["birth_country"] = best_assisters_country["birth_country"].replace(country_replace).to_list()


# display(best_scorers_country)
# print(uhij)



country_count = {i:df["birth_country"].to_list().count(i) for i in df["birth_country"].unique()}
country_count = pd.DataFrame(country_count, index=["count"]).T.reset_index()\
                                                            .rename(columns={"index":"birth_country"})

country_count = country_count.merge(best_scorers_country, 
                   how="left").merge(best_assisters_country, 
                                     how="left")


country_count["birth_country"] = country_count["birth_country"].str.title()


country_count = country_count.rename(columns={"birth_country":"Country", 
                                             "goals":"Best scorer goals", 
                                             "assists":"Best assister assists", 
                                             "count": "Number of player from"})

country_count


world = px.choropleth(capitalize(country_count), locations="Country", locationmode="country names",
                    color="Number of player from", # lifeExp is a column of gapminder
                    hover_name="Country", # column to add to hover information
                    color_continuous_scale=gradient[::-1], 
                   range_color=[10, 500], 
                   title="Origin of the players", hover_data=["Best scorer", "Best scorer goals", 
                                                              "Best assister", "Best assister assists"])

# py.plot(world)
world.show()

It's not a surprise that most players come from The UK. The second country being France, Spain.

I'm quite surprised that that many players come from the US as it's not a country that you think may generate male football players (female are more into football than male).

Do players play for the country they are born in? (check not null values)

In [ ]:
# Players who played for the country they were born in 
##########
sub_df = df_players[["nateam", "birth_country", "player_id"]].drop_duplicates().reset_index(drop=True)

lst = [1 for x in range(len(sub_df)) if sub_df.loc[x, "birth_country"]==sub_df.loc[x, "nateam"]]

same = lst.count(1)
diff = len(df_players) - same

print(f"Among {len(sub_df):,} players, {same:,} played for the country they were born in.")
Among 4,863 players, 4,474 played for the country they were born in.

Quick stats¶

Here are some stats that deverve to be told but don't really deserve a big part

Best substitute Most used player who started on the bench

In [ ]:
df = df_players.merge(df_events[df_events["description"]=="sub_on"][["description", "player_id", "match_id"]], 
                      how="right")

df = df.groupby(["player_name", "player_id"])\
       .sum()["time_pitch"]\
       .to_frame()\
       .reset_index()\
       .sort_values("time_pitch", ascending=False)



sub_players = df_players.groupby(["player_name", "player_id"])\
                        .sum()["time_pitch"]\
                        .to_frame()\
                        .reset_index()\
                        .sort_values("time_pitch", ascending=False)

sub_players.rename(columns={"time_pitch": "total_time_pitch"}, inplace=True)


df = df.merge(sub_players, on=["player_name", "player_id"], how="left")

df["ratio"] = (df["total_time_pitch"] / df["time_pitch"]).round(0)
df.head()
# sub_players
Out[ ]:
player_name player_id time_pitch total_time_pitch ratio
0 aaron connolly 21653 0.0 1630.0 inf
1 michael meaker 459 0.0 1732.0 inf
2 michael oakes 34 0.0 6264.0 inf
3 michael obafemi 21532 0.0 590.0 inf
4 michael owen 1208 0.0 21638.0 inf
In [ ]:
df = df_players.merge(df_events[df_events["description"]=="sub_oon"][["description", "player_id", "match_id"]], 
                      how="right")

Enemy of own club: who scored the most own goals?

In [ ]:
player_id = df_events[df_events["description"]=="own_goal"]["player_id"].value_counts().index[0]
count = df_events[df_events["description"]=="own_goal"]["player_id"].value_counts().values[0]
player_name = df_players[df_players["player_id"]==player_id]["player_name"].values[0]

print(f"{player_name.title()} scored {count} own-goals during his career in Prmier League.")
Richard Dunne scored 10 own-goals during his career in Prmier League.

Youngest player to score x goals¶

In [ ]:
df_scorers = df_events[df_events["type"]=="goal"][["player_id", "match_id"]]\
                                    .merge(df_players[["player_name", "age", "player_id", "match_id"]])

df_scorers = df_scorers.sort_values(["player_name", "age"])


# df_scorers["player_name"].value_counts()[0]



df_goal_age = pd.DataFrame(columns = df_scorers["player_name"].unique(),
                           index = list(range(1, df_scorers["player_name"].value_counts()[0]+1)))




for index, player in enumerate(tqdm(df_scorers["player_name"])):
    player_goals = df_scorers[df_scorers["player_name"]==player]
    
    df_goal_age.loc[list(range(1, len(player_goals)+1)), player] = player_goals["age"].to_list()
    
#     display(player_goals, df_goal_age)
#     break
df_goal_age.fillna(100, inplace=True)
100%|██████████| 28915/28915 [00:30<00:00, 935.46it/s]
In [ ]:
df_scorers_record = pd.DataFrame(columns=["player_name", "age", "count", "goal"])

goals = [1] + list(range(0, 250, 25))[1:]

for goal in goals: 
    for player, age in df_goal_age.loc[goal, ].sort_values().items():
        df_scorers_record.loc[len(df_scorers_record)] = [player, age, count, goal]
        break

df_scorers_record = df_scorers_record[df_scorers_record["age"]!=100]
df_scorers_record["age_num"] = df_scorers_record["age"].apply(lambda x:\
                               str(int(x)) + " years, " + str(int((x-int(x))*365)) + "days")



goal_age = px.bar(capitalize(df_scorers_record), x="goal", y="age", color="player_name", barmode='group', 
                  hover_name="player_name", hover_data=["age_num"],
             title="Youngest players to score X goals")

goal_age.update_xaxes(title_text='Goals count')
goal_age.update_yaxes(title_text='Player age')
goal_age.update_layout(legend_title="Player name")

Team stats¶

Team ranking¶

In [ ]:
df_games["season"].unique()
Out[ ]:
array(['2013/14', '2007/08', '1995/96', '2010/11', '2003/04', '1997/98',
       '2005/06', '2001/02', '2018/19', '1993/94', '2008/09', '2019/20',
       '2015/16', '1999/00', '2004/05', '2017/18', '2012/13', '2011/12',
       '2016/17', '1994/95', '2020/21', '1992/93', '1996/97', '2009/10',
       '2014/15', '2002/03', '1998/99', '2006/07', '2000/01'],
      dtype=object)
In [ ]:
qual_releg = {}

# Looping through every table on the wiki page of the season
for season in tqdm(df_games["season"].unique()):
    try:
        url = f"https://en.wikipedia.org/wiki/{season[:4]}%E2%80%93{season[-2:]}_Premier_League"
        tables = pd.read_html(url)
    except:
        url = f"https://en.wikipedia.org/wiki/{season[:4]}%E2%80%93{season[-2:]}_FA_Premier_League"
        tables = pd.read_html(url)
    
    for wiki_table in tables:
        # I want the ranking table so I check the one with this column (there is only on table)
        if "Qualification or relegation" in wiki_table.columns:
            wiki_table["Pos"] = wiki_table["Pos"].astype(str)
            wiki_table = wiki_table[wiki_table["Pos"].str.isdigit()]
            qualifs = wiki_table["Qualification or relegation"]
            qualifs = qualifs.str.lower().fillna("").to_list()
            
            qual_releg[season] = qualifs
100%|██████████| 29/29 [00:24<00:00,  1.16it/s]

Creating a ranking dictionary To have all the rankings from each season in one place, I'm going to store in a dictionary each season as a key and each ranking (pd.DataFrame) as a value.

In [ ]:
# Defining the interesting columns
sub_df = df_games[["season", "home_team", "away_team", "home_score", "away_score"]]

champs = {}

# Looping through every season
for season in tqdm(sub_df["season"].unique()):
    year_df = sub_df[sub_df["season"]==season].reset_index(drop=True)
    teams = year_df["home_team"].unique()
    ranking = pd.DataFrame(index = teams, columns = ["points", "W", "D", "L", "GF", "GA", "GD"])
    ranking.fillna(0, inplace=True)

    for index in range(len(year_df)):
        htg = year_df.loc[index, "home_score"] # home team goals
        atg = year_df.loc[index, "away_score"]
        
        ht = year_df.loc[index, "home_team"] # home team
        at = year_df.loc[index, "away_team"]
        
        if htg > atg:
            ranking.loc[ht, "points"] += 3 
            ranking.loc[ht, "W"] += 1 
            ranking.loc[at, "L"] += 1 
            
        elif htg < atg:
            ranking.loc[at, "points"] += 3
            ranking.loc[at, "W"] += 1
            ranking.loc[ht, "L"] += 1 
            
        else:
            ranking.loc[at, "points"] += 1
            ranking.loc[ht, "points"] += 1
            ranking.loc[[ht, at], "D"] += 1

        # Goals for 
        ranking.loc[ht, "GF"] += htg
        ranking.loc[at, "GF"] += atg
        
        # Goals against
        ranking.loc[ht, "GA"] += atg
        ranking.loc[at, "GA"] += htg
        
        # Goals difference
        ranking.loc[ht, "GD"] += htg - atg
        ranking.loc[at, "GD"] += atg - htg
    
        
    ranking["played"] = ranking["W"] + ranking["D"] + ranking["L"]
    ranking = ranking.sort_values(["points", "GD"], ascending=False)
    ranking.insert(loc = 0,
                   column = 'team',
                   value = ranking.index)
    
    # The index being the rank of the team
    ranking.index = list(range(1, 23)) if season in ["1992/93", "1993/94", "1994/95"] else list(range(1, 21))
    
    try:
        ranking["qual/releg"] = qual_releg[season]
    except:
        print(qual_releg[season])
        display(ranking)
    
    champs[season] = ranking
100%|██████████| 29/29 [00:03<00:00,  7.41it/s]

Tightest season¶

Which season was the tightest? How many points/goals of difference between the top 5 or even top 2?

In [ ]:
top_2 = {}

for season, ranking in champs.items():
    # The 7 top teams may play UCL/UEL
    points_top7 = ranking.loc[list(range(1, 8)), "points"].to_list()
    point_diff = [points_top7[x] - points_top7[x+1] for x in range(len(points_top7)-1)]
    
    top_2[season] = point_diff[0]
    

    
year, diff = sorted(top_2.items(), key=lambda x: x[1])[0]

print(f"The season {year} was the tightest with {diff} points of difference!")
# cap11_12 = capitalize(champs[year].copy())
ranking_11_12 = table(capitalize(champs[year].copy()).iloc[:, :-1].head(2), 60)
ranking_11_12
# py.plot(ranking_11_12)
The season 2011/12 was the tightest with 0 points of difference!

That season was crazy! Imagine, after 38 games, to teams have the exact same number of points. If you look at the ranking table, they actually had the exact same number of wins, draws and looses. Manchester City won because of a difference of 8 goals. This is crazy and almost never happens but where did MU lost points?

In [ ]:
# Subsetting by the year
sub_df = df_games[df_games["season"] == year][["home_team", "home_score", "away_score", "away_team",  
                                               "outcome", "home_htscore", "away_htscore", 
                                               "home_team_abbr", "away_team_abbr", "stadium", "match_id"]]

# Subsetting by MMU playing home AND outcome being A (away) or D (Draw) or
#               MMU playing away AND outcome being H (home) or D (Draw)
print("Manchest United could have won the title if they would have taken points on those games: ")
mu_looses = sub_df[(sub_df["home_team_abbr"].str.contains("mun") & sub_df["outcome"].str.contains("|".join(["a", "d"]))) |\
       (sub_df["away_team_abbr"].str.contains("mun") & sub_df["outcome"].str.contains("|".join(["h", "d"])))]


mu_looses
Manchest United could have won the title if they would have taken points on those games: 
Out[ ]:
home_team home_score away_score away_team outcome home_htscore away_htscore home_team_abbr away_team_abbr stadium match_id
233 manchester united 2 3 blackburn rovers a 0 1 mun blb old trafford 7650
2186 stoke city 1 1 manchester united d 0 1 stk mun bet365 stadium 7521
2202 liverpool 1 1 manchester united d 0 0 liv mun anfield 7537
5101 manchester united 1 6 manchester city a 0 1 mun mci old trafford 7554
5567 manchester united 4 4 everton d 1 1 mun eve old trafford 7810
5673 wigan athletic 1 0 manchester united h 0 0 wig mun dw stadium 7794
6746 manchester city 1 0 manchester united h 1 0 mci mun etihad stadium 7822
7264 manchester united 1 1 newcastle united d 0 0 mun new old trafford 7589
8741 newcastle united 3 0 manchester united h 1 0 new mun st james' park 7665
9070 chelsea 3 3 manchester united d 1 0 che mun stamford bridge 7704

They did loose twice against Man City with a lost of 6-1 at Old Trafford (MU's stadium).

This is quite surprising and as said, never happens but wait, there is more.

In [ ]:
last_game_id = df_games[(df_games["season"]=="2011/12") & (df_games["home_team_abbr"]=="mci")]\
                       .sort_values("kickoff_dt").tail(1)["match_id"].squeeze()

sub_df = df_events[["clock_label", "home_team_score", "away_team_score", "match_id", "type"]]
sub_df[(sub_df["match_id"]==last_game_id) & (sub_df["type"]=="goal")]\
      .rename(columns={"score_homeScore":"Manchester City", "score_awayScore":"Queens Park Rangers"})\
      .drop(["match_id", "type"], axis=1)
Out[ ]:
clock_label home_team_score away_team_score
41311 39 1 0
41316 48 1 1
41320 66 1 2
41328 90 2 2
41329 90 3 2

Yes, what you see is right, on a season that lasted more than 9 months, Manchester United lost the title on the last game, four minutes before the end. Tragedy.

You can watch the last 10 minutes of the game and how the players and fans go crazy when Agüero scores the last goal in the added time.

One more thing. These two teams WALKED on the Premier League this year with a difference of 19 points from the third team. 19 points is the equivalent of 6 wins plus 1 draw where the other lost 7 games. Champions.

In [ ]:
for season, ranking in champs.items():
    if ranking.iloc[-3, 1] ==49:
        print(season)
1992/93
In [ ]:
#######
# Points of difference fro meach team 
year = "2011/12"
points = list(champs[year]["points"])
values = [0] + [points[x] - points[x+1] for x in range(len(points)-1)][::-1]
teams = champs[year]["team"][::-1].to_list()

waterfall = go.Figure(go.Waterfall(
    name = "20", orientation = "v",
    x = list(map(lambda x: x.title(), teams)),
    textposition = "outside",
    text = list(map(lambda x: "+" + str(x) if x > 0 else str(x), values)),
    y = values))

waterfall.update_layout(
        title = f"Points of difference with the team ranked after itself. Season: {year} - Winner: {teams[-1]}")

waterfall.update_yaxes(title_text="Points difference")
waterfall.update_xaxes(title_text="Team", tickangle=-90)
waterfall.update_layout(height=650)

# py.plot(waterfall)
waterfall.show()

There were only 6 points of difference between the three next. Incredible.

Most titles since 95¶

Okay, Manchester United lost against their best rival but did they win more Premier Leagues in the last 26 years?

In [ ]:
# Creating a new dataframe 
df_podiums = pd.DataFrame(columns=["team", "place", "season"])

for season, ranking in champs.items():
    top3 = ranking.iloc[:3, ]
    
    season = "19"+season[-2:] if season[0]=="1" else "20"+season[-2:]
    season = "2000" if season == "1900" else season

    for index, team in enumerate(top3["team"]):
        if any([team, index+1]==x for x in df_podiums[["team", "place"]].values.tolist()):
            df_podiums.loc[(df_podiums.team==team) & (df_podiums.place==index+1), "season"] += ", "+season
            
        else:
            df_podiums.loc[len(df_podiums)] = [team, index+1, season]
        
        
df_podiums["count"] = df_podiums["season"].apply(lambda x: x.count(",")+1)

df_podiums = df_podiums.sort_values(["count", "place"], ascending=[0, 1])
df_podiums["place"] = df_podiums["place"].apply(lambda x: \
                                        "Champion" if x==1 else("Runner-up" if x==2 else "2nd runner-up (3rd)"))
        
    
    
df_podiums = df_podiums.sort_values("count", ascending=False)
        
df_podiums.columns = list(map(lambda x: x.title(), df_podiums.columns))

podiums = px.bar(capitalize(df_podiums), x="Team", y="Count",
                color='Place', barmode='group',
                height=500, hover_name="Season", 
                title="Number of time each team has finished at which place of the podium")

podiums.update_xaxes(title_text='Team name')
podiums.update_yaxes(title_text='Count')
# py.plot(podiums)

podiums.show()

Of course they did! Since 1995, Manchester United won 11 times the Premier League and MCI 5 times! MU didn't win since 2013 and MCI didn't (not actually true, explain why).

Special mention to Leicester, one time on the podium, one time champion. You may not realize but this had almost no chance to happen, at the beginning of the season, the odds where 5001/1 which mean that 10 Dollars bet on Leicester would have returned you 50,010 Dollars. Unpredictable.

The most, less in a season¶

In [ ]:
def minmax_season(element, min_max):
    season = ""
    variable = 0 if min_max == "max" else 1000

    for year, ranking in champs.items():
        maax = ranking[element].max()
        miin = ranking[element].min()
        
        if maax > variable and min_max == "max":
            season = year
            variable = maax
            
        elif miin < variable and min_max == "min":
#             print(variable)
            season = year
            variable = miin
            
            
    sub_df = champs[season].copy()
    print(f"Season: {season}")
    if min_max == "max": display(sub_df[sub_df[element] == sub_df[element].max()])
    elif min_max == "min": display(sub_df[sub_df[element] == sub_df[element].min()])

Even if I support neither MU neither MC, I still feel sad for MU so lets look at the greatest record and not show which team had the least points, least wins or max looses.

Max points

Again, City won the 2017/18 season with a record of three digits threshold. {find an adjective}

In [ ]:
minmax_season("points", "max")
Season: 2017/18
team points W D L GF GA GD played qual/releg
1 manchester city 100 32 4 2 106 27 79 38 qualification for the champions league group s...

Did they break the record for the most wins in a season though?

Max wins

In [ ]:
minmax_season("W", "max")
Season: 2018/19
team points W D L GF GA GD played qual/releg
1 manchester city 98 32 2 4 95 23 72 38 qualification for the champions league group s...

Of course they did...

Less looses

I know that City doesn't have this record. Even if they lost only two games in the year they broke the points record, Arsenal did something fantastic more than a decade before.

In [ ]:
minmax_season("L", "min")
Season: 2003/04
team points W D L GF GA GD played qual/releg
1 arsenal 90 26 12 0 73 26 47 38 qualification for the champions league group s...

If you know a little bit about football or follow it recently, you know that Arsenal has become a meme for its weak performances. This former great team, part of the BIG 6, had some difficulty recently but did one thing that will stay in history forever. Arsenal did not loose a single match during the season 2003/04 and for that they had the greatest price: a golden Premier League trophy. The photo below is showing Arsene Wenger, Arsenal's former manager lifting the golden tropy. He served the team for 22 years, 1235 games (707W - 280D - 248L). When you see that managers now rarely stay more than four or five years, it really makes you think about the past and realize tat football has drastically changed.

drawing

Least goals against

Which goalkeeper has been amazing?

In [ ]:
minmax_season("GA", "min")
Season: 2004/05
team points W D L GF GA GD played qual/releg
1 chelsea 95 29 8 1 72 15 57 38 qualification for the champions league group s...

Jesus! In 04/05 Chelsea took only 15 goals! Who played in defense and who was the goalkeeper?

In [ ]:
# Defining Chelsea's id
chelsea_id = df_games[df_games["home_team"]=="chelsea"]["home_team_id"].mean()
print(chelsea_id)

# Games id in 2004/05
gamesid_0405 = df_games[df_games["season"]=="2004/05"]["match_id"]

# Chealsea 2004/05 players
sub_df = df_players[(df_players["match_id"].isin(gamesid_0405)) & \
                    (df_players["team_id"]==chelsea_id)]

# display(sub_df)
sub_df = sub_df.groupby(["player_name", "normal_position", "player_id"]).sum()["time_pitch"].to_frame().reset_index()
sub_df[sub_df["normal_position"]=="goalkeeper"]
4.0
Out[ ]:
player_name normal_position player_id time_pitch
4 carlo cudicini goalkeeper 1676 262.0
17 lenny pidgeley goalkeeper 2231 0.0
22 petr cech goalkeeper 2651 3150.0

Three goalkeepers played for Chelsea during this season: Petr Chech 92% of the time and Carlo Cudicini the rest of the time with Lenny Pidgeley who only played 23 minutes.

In [ ]:
# Retrieving all chelsea players
chelsea_playersid = df_players[df_players["team_id"]==chelsea_id]["player_id"].unique()

# All games Petr Chech played 
petr_gameid = df_players[(df_players["player_id"]==2651) &\
                         (df_players["match_id"].isin(gamesid_0405))]\
                        ["match_id"]

# # All games total goals for opponant
apponents_goals = df_events[(df_events["type"]=="goal") &\
                            (df_events["match_id"].isin(petr_gameid)) &\
                            (~df_events["player_id"].isin(chelsea_playersid))]

# display(apponents_goals)


print(f"Games played: {len(petr_gameid)}")
print(f"Clean sheets: {len(petr_gameid) - apponents_goals.shape[0]}")
Games played: 35
Clean sheets: 23

Over 35 games (90mn * 35 = 3,150 time_pitch from the precedent output), Petr Cech did 23 clean sheets. I'm honestly a little bit disappointed as I thought he was going to have a better ratio (~80%) but we can see that he never took more than 2 goals so it makes sense! Legend.

Team ranking evolution¶

In [ ]:
sub_df = pd.DataFrame(columns = ["season", "team", "rank"])

for season, ranking in champs.items():
    for rank in ranking.index:
        sub_df.loc[len(sub_df)] = [season, ranking.loc[rank, "team"], int(rank)]


teams = ["manchester united", "liverpool", "arsenal", 
         "tottenham hotspur", "chelsea", "manchester city"]

sub_df = sub_df[sub_df["team"].isin(teams)]
sub_df["rank"] = sub_df["rank"].astype(int)
In [ ]:
# sub_df["rank"] = sub_df["rank"].apply(lambda x: 10 + (10-x))

sub_df = sub_df.sort_values("season")
sub_df.season.unique()
Out[ ]:
array(['1992/93', '1993/94', '1994/95', '1995/96', '1996/97', '1997/98',
       '1998/99', '1999/00', '2000/01', '2001/02', '2002/03', '2003/04',
       '2004/05', '2005/06', '2006/07', '2007/08', '2008/09', '2009/10',
       '2010/11', '2011/12', '2012/13', '2013/14', '2014/15', '2015/16',
       '2016/17', '2017/18', '2018/19', '2019/20', '2020/21'],
      dtype=object)
In [ ]:
big6 = ["manchester city", "manchester united", "liverpool", "tottenham hotspur", "chelsea", "arsenal"]
In [ ]:
 
In [ ]:
df_points = pd.DataFrame(columns = df_games["home_team"].unique(),
                         index = df_games["season"].unique())

for season, ranking in champs.items():
    for team in ranking["team"]:
#         points = ranking.loc[ranking.team==team, "points"].squeeze()
        df_points.loc[season, team] = ranking["team"].to_list().index(team) + 1

        

# df_points = df_points.fillna(0) 
df_points = df_points[big6]
df_points


# # plotly
fig = go.Figure()
colors = px.colors.qualitative.Plotly 
df = df_points
# set up multiple traces
for col in df.columns:
    fig.add_trace(go.Scatter(x=df.index,
                             y=df[col],
                             name  = col,
                             visible=True, 
                            )
                 )

um = [ {} for _ in range(len(df.columns)) ]
buttons = []
menuadjustment = 0.10

buttonX = -0.05
buttonY = 1 + menuadjustment
for i, col in enumerate(df.columns):
    button = dict(method='restyle',
                  label=col,
                  visible=True,
                  args=[{'visible':True,
                         'line.color' : colors[i]}, [i]],
                  args2 = [{'visible': False,
                            'line.color' : colors[i]}, [i]],
                 )
    
    # adjust some button features
    buttonY = buttonY-menuadjustment
    um[i]['buttons'] = [button]
    um[i]['showactive'] = False
    um[i]['y'] = buttonY
    um[i]['x'] = buttonX
    
    
    
    

# add a button to toggle all traces on and off
button2 = dict(method='restyle',
               label='All',
               visible=True,
               args=[{'visible':True}],
               args2 = [{'visible': False}],
               )

# assign button2 to an updatemenu and make some adjustments
um.append(dict())
um[i+1]['buttons'] = [button2]
um[i+1]['showactive'] = True
um[i+1]['y']=buttonY - menuadjustment
um[i+1]['x'] = buttonX
    
# add dropdown menus to the figure
# print(um)
fig.update_layout(title="Ranking of the Big 6 per year", showlegend=True, updatemenus=um)

# adjust button type
for m in fig.layout.updatemenus:
    m['type'] = 'buttons'

fig['layout']['yaxis']['autorange'] = "reversed"


lst = list(range(1, 21))
fig.update_layout(
    yaxis = dict(
        tickmode = 'array',
        tickvals = lst[::2],
        ticktext = list(map(lambda x: str(x), lst))[::2]
    )
)


fig.update_xaxes(title_text="Season", tickangle=-90)

# py.plot(fig)

fig.show()
In [ ]:
df_games["season"].value_counts()
Out[ ]:
1992/93    462
1993/94    462
1994/95    462
2013/14    380
2017/18    380
2006/07    380
1998/99    380
2002/03    380
2014/15    380
2009/10    380
1996/97    380
2020/21    380
2016/17    380
2011/12    380
2012/13    380
2004/05    380
2007/08    380
1999/00    380
2015/16    380
2019/20    380
2008/09    380
2018/19    380
2001/02    380
2005/06    380
1997/98    380
2003/04    380
2010/11    380
1995/96    380
2000/01    380
Name: season, dtype: int64

Deep explanation of what happened

Ranking facts¶

here some quick facts, information where you may or may not find interest in but...

Avg & max nb or points for relegation

In [ ]:
points_releg = []
releg_teams = []
for season, ranking in champs.items():
    releg = "relegation to the football league first division"
    releg_ranking = ranking[ranking["qual/releg"].str.contains("releg")]
    points_releg += releg_ranking["points"].to_list()
    releg_teams.extend(releg_ranking["team"].to_list())    
        
print(f"""The average points to be relagated is {np.mean(points_releg):.0f} points.
The maximum that ever happened was {max(points_releg)}.""")
The average points to be relagated is 32 points.
The maximum that ever happened was 49.
In [ ]:
releg_count = pd.DataFrame({i:[releg_teams.count(i)] for i in set(releg_teams)}).T.reset_index()
releg_count.columns = ["team", "count"]
releg_count

fig = px.bar(capitalize(releg_count).sort_values("count", ascending=False), x='team', y='count', 
            title="Relagated teams count")

fig.update_xaxes(title_text="Team name", tickangle=-90)
fig.update_yaxes(title_text="Count")


fig.show()

Comeback after relegation

In [ ]:
rankings = list(champs.values())
come_back = 0
all_releg_teams = []
for index, ranking in enumerate(rankings):
    releg_teams = ranking[ranking["qual/releg"].str.contains("releg")]["team"].to_list()
    all_releg_teams += releg_teams
    
    try:
        if any(team in rankings[index+2]["team"].to_list() for team in releg_teams):
            come_back += [team in rankings[index+2]["team"].to_list() for team in releg_teams].count(True)
    
    except IndexError:...


print(f"Over the {len(set(all_releg_teams))}/{len(all_releg_teams)} teams that have been relagated, {come_back} came back the next year.") 
Over the 41/87 teams that have been relagated, 27 came back the next year.

Favorite victim¶

Which team scored many goals against a special team?

In [ ]:
df = df_games.groupby(["home_team", "away_team"]).sum()[["home_score", "away_score"]]
df = df.reset_index()

big6 = ["manchester city", "manchester united", "liverpool", "tottenham hotspur", "chelsea", "arsenal"]

df_team_vs_team_goals = pd.DataFrame(columns=big6, index=big6)
df_team_vs_team_count = pd.DataFrame(columns=big6, index=big6)


df_team_vs_team_goals.fillna(0, inplace=True)
df_team_vs_team_count.fillna(0, inplace=True)


for home_team in big6:
    for away_team in big6:
        if home_team != away_team:
            sub = df_games[(df_games.home_team==home_team) & (df_games.away_team==away_team)]
            df_team_vs_team_goals.loc[home_team, away_team] += sub["home_score"].sum()
            df_team_vs_team_goals.loc[away_team, home_team] += sub["away_score"].sum()
            df_team_vs_team_count.loc[home_team, away_team] += sub.shape[0]
            df_team_vs_team_count.loc[away_team, home_team] += sub.shape[0]
        

    
    
In [ ]:
list(df_team_vs_team_goals.columns)
Out[ ]:
['manchester city',
 'manchester united',
 'liverpool',
 'tottenham hotspur',
 'chelsea',
 'arsenal']
In [ ]:
fig = px.imshow(df_team_vs_team_goals, 
                zmin = sorted(set([x for lst in df_team_vs_team_goals.values for x in lst]))[1],
                color_continuous_scale=px.colors.sequential.Blues)
fig.show()
In [ ]:
 
In [ ]:
# df_team_vs_team_count, df_team_vs_team_goals
In [ ]:
vs_per_game = []

for index1, x in enumerate(df_team_vs_team_goals.values):
    mid = []
    for index2, y in enumerate(x):
#         print(y/df_team_vs_team_count.values[index1][index2])
        mid.append(round(y/df_team_vs_team_count.values[index1][index2], 2))
        
    vs_per_game.append(mid)
        
df_team_vs_team_goals_game = pd.DataFrame(vs_per_game, columns = df_team_vs_team_goals.columns, 
                           index = df_team_vs_team_goals.index)

df_team_vs_team_goals_game
Out[ ]:
manchester city manchester united liverpool tottenham hotspur chelsea arsenal
manchester city NaN 1.27 1.33 1.42 1.04 1.19
manchester united 1.46 NaN 1.36 1.69 1.24 1.34
liverpool 1.54 1.16 NaN 1.72 1.26 1.67
tottenham hotspur 1.40 0.98 1.14 NaN 0.95 1.24
chelsea 1.48 1.22 1.21 1.78 NaN 1.34
arsenal 1.46 1.05 1.24 1.59 1.33 NaN
In [ ]:
programmers = ['Alex','Nicole','Sara','Etienne','Chelsea','Jody','Marianne']

# base = datetime.datetime.today()
# dates = base - np.arange(180) * datetime.timedelta(days=1)
# z = np.random.poisson(size=(len(programmers), len(dates)))

# fig = go.Figure(data=go.Heatmap(
#         z=df_team_vs_team_goals_game.values,
#         x=df_team_vs_team_goals_game.index,
#         y=df_team_vs_team_goals_game.columns,
#         colorscale='Viridis'))

# fig.update_layout(
#     title='GitHub commits per day',
#     xaxis_nticks=36)

# fig.show()
# dates
In [ ]:
 

I'm

Win/draw/loose ratio¶

In [ ]:
 
In [ ]:
total_ranking = pd.DataFrame()

for season, ranking in champs.items():
    total_ranking = pd.concat([total_ranking, ranking], axis=0)
    
total_ranking = total_ranking.groupby("team").sum().reset_index(drop=False)

total_ranking["played"] = total_ranking["W"] + total_ranking["D"] + total_ranking["L"]
total_ranking["seasons"] = total_ranking["team"].apply(lambda x:\
                                [team for ranking in champs.values() for team in ranking["team"].to_list() ]\
                                .count(x))



for outcome in ["W", "D", "L"]:
    total_ranking[f"{outcome}%"] = round(total_ranking[f"{outcome}"] / total_ranking["played"] * 100, 2)
    
    
print(total_ranking.shape)
total_ranking = total_ranking.sort_values(["W", "D"], ascending=False)

total_ranking.head(7)
(49, 13)
Out[ ]:
team points W D L GF GA GD played seasons W% D% L%
26 manchester united 2308 687 247 180 2128 1009 1119 1114 29 61.67 22.17 16.16
1 arsenal 2072 597 281 236 1956 1100 856 1114 29 53.59 25.22 21.18
13 chelsea 2064 597 273 244 1897 1092 805 1114 29 53.59 24.51 21.90
24 liverpool 2017 581 274 259 1927 1121 806 1114 29 52.15 24.60 23.25
42 tottenham hotspur 1716 480 276 358 1676 1398 278 1114 29 43.09 24.78 32.14
25 manchester city 1536 444 204 276 1559 1042 517 924 24 48.05 22.08 29.87
17 everton 1535 407 314 393 1448 1415 33 1114 29 36.54 28.19 35.28
In [ ]:
table(total_ranking.round(0).head(76), line_break=True)
In [ ]:
total_ranking = total_ranking.sort_values(["W%", "D%"], ascending=False)
# I actually did 5 more minutes of research and found that. Look and the start of the commented code haha...
stacked_ratio = pd.concat([total_ranking.set_index("team")[["W%", "D%", "L%"]].stack().reset_index(),
                           total_ranking[["W", "D", "L"]].stack().reset_index(drop=True)], axis=1)
                             
# stacked_ratio.columns = ["team", "outcome", "percent", "count"]
stacked_ratio = stacked_ratio.merge(total_ranking, how="left", on="team")





stacked_ratio.columns  = ['Team', 'Outcome', 'Percent', 'Count', 'Overall points', 'Wins', 
                          'Draws', 'Looses', 'Goals for', 'Goals against', 'Goals difference', 
                          'Games played', 'Seasons played', 'Win rate', 'Draw rate', 'Loose rate']

stacked_ratio  = stacked_ratio[['Team', 'Outcome', 'Percent', 'Count', 'Games played', 'Seasons played', 
                                'Overall points', 'Wins', 
                          'Draws', 'Looses', 'Goals for', 'Goals against', 'Goals difference']]


# Formating hover_data in plot
stacked_ratio.insert(3, " ", "")
stacked_ratio.insert(8, "  ", "")
stacked_ratio.insert(12, "   ", "")

for col in stacked_ratio.columns:
    if col != 'Percent':
        stacked_ratio[col] = stacked_ratio[col].apply(lambda x: " "+str(x))





# Changing colors of variables
color=["#00afb9", "#fdfcdc", "#F24333"]
diff = [" W%", " D%", " L%"]
colors = {}

for index, x in enumerate(color): 
    colors[diff[index]] = f"rgb{str(px.colors.hex_to_rgb(x))}"


fig = px.bar(capitalize(stacked_ratio), y="Percent", x="Team", color="Outcome", hover_name="Team",
             color_discrete_map=colors, title="Ratio Win, Draw, Lost in % - All seasons included", 
            template="plotly_white", 
            hover_data=stacked_ratio.columns.drop("Count"))
    
fig.update_xaxes(title_text="Team", tickangle=-90)
fig.update_yaxes(title_text="Percentage")
fig.update_layout(legend_title="Outcome", height=700)

# py.plot(fig)
fig.show()

Manchester Unites is on another planet. Being one of the 5 team that is in PL since 1995, it's also the team with the highest win ratio, 62%. Machine.

I told you before, you can make fun of Arsenal today, but over the last 26 years, this team did an astonishing 559 wins.

I cannot believe that! Arsenal and Chelsea, two ennemies have THE EXACT SAME NUMBER of wins. Over the 1114 games played over 29 years, in Premier League, they both won 597 times and both rank at the second place but Arsenal takes the advantages with slightly more draws. Let's dive into it to see the difference over the years.

In [ ]:
df_chelsea_arsenal = pd.DataFrame(columns=["season", "team", "points"])

cpoints = 0
apoints = 0
for season, ranking in champs.items():
    cpoints += ranking[ranking["team"]=="chelsea"]["W"].squeeze()
    apoints += ranking[ranking["team"]=="arsenal"]["W"].squeeze()
    
    df_chelsea_arsenal.loc[len(df_chelsea_arsenal)] = [season, "chelsea", cpoints]
    df_chelsea_arsenal.loc[len(df_chelsea_arsenal)] = [season, "arsenal", apoints]
    
    
fig = px.line(df_chelsea_arsenal, x="season", y="points", 
              color="team", width=1000, height=400, 
             title="Cumulated points")

fig.update_layout(legend_title="Team")
fig.update_xaxes(title="Season")
fig.update_yaxes(title="Cumulated points")

# py.plot(fig)
fig.show()

I think in term of crazy insight, I have quite some in this analysis. In 1992/93, Chelsea and Arsenal respectively finished the first season of Premier League at 14 and 15 points. Next season will be the 30th edition of the PRemier league so maybe Chelsea, will finally pass its historical ennemy.

Big 6 vs everybody elese¶

The best six teams being: blablabla

Is is that good? how are they performing together vs the rest of the league.? When did the big 6 really became the big 6?

In [ ]:
ranking
Out[ ]:
team points W D L GF GA GD played qual/releg
1 manchester united 80 24 8 6 79 31 48 38 qualification for the champions league first g...
2 arsenal 70 20 10 8 63 38 25 38 qualification for the champions league first g...
3 liverpool 69 20 9 9 71 39 32 38 qualification for the champions league third q...
4 leeds united 68 20 8 10 64 43 21 38 qualification for the uefa cup first round[a]
5 ipswich town 66 20 6 12 57 42 15 38 qualification for the uefa cup first round[a]
6 chelsea 61 17 10 11 68 45 23 38 qualification for the uefa cup first round[a]
7 sunderland 57 15 12 11 46 41 5 38
8 aston villa 54 13 15 10 46 43 3 38 qualification for the intertoto cup third round
9 charlton athletic 52 14 10 14 50 57 -7 38
10 southampton 52 14 10 14 40 48 -8 38
11 newcastle united 51 14 9 15 44 50 -6 38 qualification for the intertoto cup third round
12 tottenham hotspur 49 13 10 15 47 54 -7 38
13 leicester city 48 14 6 18 39 51 -12 38
14 middlesbrough 42 9 15 14 44 44 0 38
15 west ham united 42 10 12 16 45 50 -5 38
16 everton 42 11 9 18 45 59 -14 38
17 derby county 42 10 12 16 37 59 -22 38
18 manchester city 34 8 10 20 41 65 -24 38 relegation to the football league first division
19 coventry city 34 8 10 20 36 63 -27 38 relegation to the football league first division
20 bradford city 26 5 11 22 30 70 -40 38 relegation to the football league first division
In [ ]:
# df_big6_points
In [ ]:
df_big6_points = pd.DataFrame(columns=["season", "type", "points", "team"])

for season, ranking in champs.items():
    b6_wins = ranking[ranking["team"].isin(big6)]["W"].sum() / ranking["W"].sum() * 100
    b6_points = ranking[ranking["team"].isin(big6)]["points"].sum() / ranking["points"].sum() * 100
    
    df_big6_points.loc[len(df_big6_points)] = [season, "wins", b6_points, "B6"]
    df_big6_points.loc[len(df_big6_points)] = [season, "wins", 100-b6_points, "Others"]
#     df_big6_points.loc[len(df_big6_points)] = [season, "points", b6_points, "B6"]
#     df_big6_points.loc[len(df_big6_points)] = [season, "points", 100-b6_points, "Others"]
    
    
    
fig = px.bar(df_big6_points, y="points", x="season", color="team", barmode='group', 
             title="Distribution of points big 6 vs other teams (in %)")
fig.update_xaxes(tickangle=-90)

fig.update_xaxes(title_text="Season", tickangle=-90)
fig.update_yaxes(title_text="Percentage")
fig.update(layout_coloraxis_showscale=False)

# py.plot(fig)
fig.show()

At the beggining of the Premier League, the Big 6 was representing about 30% of the points and wins, which for a competition of 22 teams is fine. Yes, Manchester city, was not in PL from 95 to 00 but still, after they c

Before the Big 6 reigned on the England football, a 'top four' of dominant teams emerged. You saw the previous graph, Liverpool, Manchester City, Arsenal and Chelsea were perennial contenders in the division.

Games stats¶

In [ ]:
df_events[df_events["description"]=="yellow_card"]["clock_label"].value_counts()
Out[ ]:
89    10354
90     2183
45      873
44      486
88      459
      ...  
2        50
5        50
3        48
1        33
0         5
Name: clock_label, Length: 91, dtype: int64

Minutes¶

Events distribution¶

In [ ]:
df_events["mn"] = df_events["clock_label"] + df_events["add_time"]
In [ ]:
mn_perhalf = []
for x in tqdm(range(len(df_events))):
    data = df_events.loc[x, ].to_list()
    if data[1] == 1:
        mn_perhalf.append(data[0]+data[-1])
        
    else:
        mn_perhalf.append(data[0]+data[-1]-45)
        
df_events["mn_perhalf"] = mn_perhalf
100%|██████████| 214429/214429 [00:12<00:00, 17054.77it/s]
In [ ]:
events = {"yellow_card":"Yellow card", "red_card":"Red card", "2y_red_card":"2nd yellow --> red card", 
          "goal":"Goal", "penalty":"Penalty", "sub_on":"Substitution"}

df_events.replace(events)
Out[ ]:
clock_label phase type home_team_score away_team_score player_id team_id assist_id description match_id add_time mn mn_perhalf
0 0 1 play_start 0 0 NaN NaN NaN NaN 1875 0 0 0
1 5 1 Goal 0 1 1037.0 7.0 179.0 Goal 1875 0 5 10
2 45 1 play_end 0 1 NaN NaN NaN NaN 1875 0 45 90
3 45 2 play_start 0 1 NaN NaN NaN NaN 1875 0 45 45
4 64 2 substitution 0 1 1087.0 15.0 NaN Substitution 1875 0 64 83
... ... ... ... ... ... ... ... ... ... ... ... ... ...
214424 90 2 play_end 2 2 NaN NaN NaN NaN 8190 0 90 135
214425 0 1 play_start 0 0 NaN NaN NaN NaN 8191 0 0 0
214426 45 1 play_end 0 1 NaN NaN NaN NaN 8191 0 45 90
214427 45 2 play_start 0 1 NaN NaN NaN NaN 8191 0 45 45
214428 90 2 play_end 0 1 NaN NaN NaN NaN 8191 0 90 135

214429 rows × 13 columns

In [ ]:
events = {"yellow_card":"Yellow card", "red_card":"Red card", "2y_red_card":"2nd yellow --> red card", 
          "goal":"Goal", "penalty":"Penalty"}

sub_df = df_events.replace(events)
events = list(events.values())[::-1]

data = []
for event in tqdm(events):
    to_add = []
    for clock_time in sub_df[sub_df["description"]==event]["mn"]:
        to_add.append(clock_time)
    
    data.append(to_add)
    

colors = "FFE548 FF2E1F F55A00 000000 4F6D7A"
colors = ["#"+x for x in colors.split(" ")][::-1]

distri = go.Figure()
count = 0
for data_line, color in zip(data, colors):
    distri.add_trace(go.Violin(x=data_line[::-1], 
                            line_color=color,
                               
                            bandwidth=0.5, 
                           name = events[count]))
    count += 1

distri.update_traces(side='positive', width=2, points=False)
distri.update_layout(xaxis_showgrid=False, xaxis_zeroline=True)
distri.update_xaxes(title_text="Minutes in the game")
distri.update_yaxes(title_text="Event")
distri.update(layout_coloraxis_showscale=False)
distri.update_layout(title="Distribution of events over the minutes of the games", 
                     legend_title="Event", 
                    legend={'traceorder':'reversed'}, 
                    violingap=0.9)

# py.plot(distri)

distri.show()
100%|██████████| 5/5 [00:00<00:00, 127.54it/s]

Goals distribution¶

In [ ]:
sub_df = pd.DataFrame()
sub_df["team_id"] = df_games["home_team_id"].append(df_games["away_team_id"])
sub_df["match_id"] = df_games["match_id"].append(df_games["match_id"])
sub_df["location"] = ["home" for  x in range(int(len(sub_df)/2))] + ["away" for  x in range(int(len(sub_df)/2))]



sub_events = df_events[df_events.type.isin(["goal", "own_goal"])]
sub_df = sub_df.merge(sub_events, on=["team_id", "match_id"], how="right")[["location", "phase", "type"]]

sub_df = sub_df.groupby(["location", "phase"]).count()

total_goals = sum(sub_df.iloc[:2, 0].to_list() + sub_df.iloc[2:, 0].to_list())
sub_df["type"] = sub_df["type"].apply(lambda x: round(x/total_goals*100, 1))
sub_df
data = [sub_df.iloc[:2, 0].to_list()] + [sub_df.iloc[2:, 0].to_list()]
data



fig = px.imshow(data,
                labels=dict(x="home/away team", y="Half", color="Goals"),
                x=['Away', 'Home'],
                y=["First half", "Second half"], 
                color_continuous_scale = px.colors.sequential.Blues)

fig.update_layout(title=f"Distribution of the goals scored in % - Total goals: {total_goals:,}")

# py.plot(fig)
fig.show()

Come back¶

In football, a come back is when a team is loosing at half time but manage to win the game at the end.

In [ ]:
# df_events[~df_events["match_id"].isin(df_games["match_id"].to_list())]

missing_ids = set(df_games["match_id"].unique()) - set(df_events[df_events["type"]=="play_end"]["match_id"].unique())
missing_ids = list(missing_ids)
to_add = []
for missing_id in tqdm(missing_ids):

    sub = df_events[df_events["match_id"]==missing_id]

    # If there is an error, then there was no event meaning 0-0
    try:
        ht_score = sub[sub["clock_label"]<46].iloc[-1, 3:5]
    except IndexError: ht_score = [0, 0] 
        
    ft_score = sub.iloc[-1, 3:5]

    play_start1 = [0, 1, "play_start", 0, 0] + [np.nan]*4 + [missing_id]
    play_end1 = [45, 1, "play_end", ht_score[0], ht_score[1]] + [np.nan]*4 + [missing_id]
    play_start2 = [45, 2, "play_start", ht_score[0], ht_score[1]] + [np.nan]*4 + [missing_id]
    play_end2 = [90, 2, "play_end", ft_score[0], ft_score[1]] + [np.nan]*4 + [missing_id]

    
    to_add.extend([play_start1, play_end1, play_start2, play_end2])
    to_add
0it [00:00, ?it/s]
In [ ]:
test = df_events.copy()

sus = pd.DataFrame(to_add, columns = test.columns)

new = pd.concat([test, sus], axis = 0)

new
new[new["type"].isin(["play_end"])]
Out[ ]:
clock_label phase type home_team_score away_team_score player_id team_id assist_id description match_id add_time mn mn_perhalf
2 45 1 play_end 0 1 NaN NaN NaN NaN 1875 0 45 90
10 90 2 play_end 0 1 NaN NaN NaN NaN 1875 0 90 135
15 45 1 play_end 0 0 NaN NaN NaN NaN 14249 2 47 92
35 90 2 play_end 3 0 NaN NaN NaN NaN 14249 5 95 140
40 45 1 play_end 0 1 NaN NaN NaN NaN 9597 3 48 93
... ... ... ... ... ... ... ... ... ... ... ... ... ...
214420 90 2 play_end 1 0 NaN NaN NaN NaN 8189 0 90 135
214422 45 1 play_end 1 1 NaN NaN NaN NaN 8190 0 45 90
214424 90 2 play_end 2 2 NaN NaN NaN NaN 8190 0 90 135
214426 45 1 play_end 0 1 NaN NaN NaN NaN 8191 0 45 90
214428 90 2 play_end 0 1 NaN NaN NaN NaN 8191 0 90 135

22532 rows × 13 columns

In [ ]:
df_come_back = new[new["type"]=="play_end"][["phase", "home_team_score", "away_team_score", "match_id"]]

                                   
df_come_back["outcome"] = df_come_back[["home_team_score", "away_team_score"]]\
    .apply(lambda x: ("h" if x["home_team_score"] > x["away_team_score"] else "a") \
           if x["home_team_score"] != x["away_team_score"] else "d",
           axis=1)


cb_count = 0
hcb_count = 0
# for unique_id in df_come_back["match_id"].unique():
#     outcome = df_come_back[df_come_back["match_id"]==]["outcome"].to_list()
games_ids = []
for index, outcome in enumerate(df_come_back["outcome"]):
    if index % 2 == 0:
        outcome = df_come_back.iloc[index:index+2, -1].to_list()
    
        if "h" in outcome and "a" in outcome:
            cb_count += 1

            if outcome[1] == "h":
                hcb_count += 1
                
            games_ids.append(df_come_back.iloc[index, -2])
        
games = df_games.shape[0]
print(f"Over {games} games, there had been {cb_count} come-backs! {hcb_count/cb_count*100:.0f}% being home!")
Over 11266 games, there had been 453 come-backs! 58% being home!

Cards¶

Yellow cards effectiveness¶

Are yellow card effective? What the ratio of a player who get a yellow card vs two yellow cards. Do players play less hard after receiving one?

In [ ]:
count_yellow = df_events[df_events["description"]=="yellow_card"]["description"].count()
count_2yellow = df_events[df_events["description"]=="2y_red_card"]["description"].count()

print(f"Only {count_2yellow/count_yellow*100:.2f}% of yellow cards ({count_yellow:,}) lead to a second one ({count_2yellow:,}).")



# print(f"For {count_yellow:,} yellow cards given, ")
# only x of first card lead to a second
Only 1.03% of yellow cards (34,321) lead to a second one (355).